Problem Description

In this project, the main goal is to forecast the sales of 9 products for the dates between 01.06.2021 and 27.06.2021. Forecasting sales in online retail is very challenging due to randomness and dependence on many factors.

There are 9 product types. These are:

The data include the following features:

Using the necessary features, the total number of sales of each product per day will be forecasted.

Approach

Note: The necessary functions, libraries, and the data are imported.

library(dplyr)
library(tidyr)
library(data.table)
library(lubridate)
library(ggplot2)
library(GGally)
library(urca)
library(forecast)
library(rjson)
library(zoo)
library(caret)
library(ggcorrplot)

read_data <- function(path, shift = TRUE){
  # Load the raw sales CSV plus the incremental JSON feed and combine
  # them into a single data.table.
  #
  # path   directory containing "ProjectRawData.csv" and "indir.json".
  # shift  if TRUE, correct the one-column displacement present in the
  #        JSON rows and mark negative prices as NA.
  # Returns a data.table with the JSON rows stacked on top of the CSV rows.
  #
  # NOTE: setwd() is kept on purpose — later parts of the script load
  # .RData files with paths relative to this directory.
  setwd(path)
  data <- read.csv("ProjectRawData.csv")
  data$event_date <- ymd(data$event_date)
  # Keep rows where at least one of date / product id parsed successfully.
  data <- data[!(is.na(data$event_date)) | !(is.na(data$product_content_id)),]
  data <- data.table(data)

  # Each JSON element is one record; convert and bind in a single pass
  # instead of growing a data.frame inside a loop (O(n^2) copies).
  result <- fromJSON(file = "indir.json")
  table <- do.call(rbind, lapply(seq_along(result),
                                 function(i) as.data.frame(result[i])))

  # Reorder the JSON columns to match the CSV column order, keep only
  # the rows newer than the CSV coverage.
  table <- data.table(table[,c(2,3,1,4,5,7,6,8,10,12,13,9,11)])
  table$event_date <- as.Date(table$event_date)
  table <- table[event_date>="2021-05-29",]
  new_table_len <- nrow(table)
  current_data <- rbind(table,data)

  # Guard against an empty JSON feed: with zero new rows there is
  # nothing to shift (and 1:0 / -(1:0) indexing would misbehave).
  if(!shift || new_table_len == 0){
    return(current_data)
  }
  # Undo the one-column displacement in the freshly appended rows, then
  # glue the corrected rows back on top of the untouched CSV rows.
  data_corrected <- current_data[seq_len(new_table_len),c(1:7,12,8:11,13)]
  colnames(data_corrected) <- colnames(current_data)
  data_deficit <- current_data[-seq_len(new_table_len),]
  data_corrected <- data.table(rbind(data_corrected, data_deficit))
  # Negative prices are sentinel values in the feed; treat as missing.
  data_corrected[price<0,price:=NA]
  return(data_corrected)
}

visit_count_calc <- function(data_prod){
  # Backcast visit_count for the period where the feed has no data
  # (before 2021-01-30) using chained linear regressions on a centered
  # basket_count predictor: each pass trains on everything already
  # observed or estimated and predicts the next-older window.
  # The four copy-pasted regression stanzas of the original are folded
  # into one loop; re-deriving `train` from `test_train` each round is
  # redundant because the filter always keeps every row.
  #
  # NOTE(review): the write-back at the end is positional — it assumes
  # test_train rows align with data_prod rows; verify if the input row
  # order ever changes.

  # Centered predictor mitigates structural multicollinearity
  # (see report text). `:=` modifies data_prod by reference.
  data_prod <- data_prod[,var:=basket_count-mean(basket_count)]

  # Window boundaries, newest to oldest; window k is (lower[k], upper[k]],
  # except the last one, which also includes its lower bound.
  upper <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01")
  lower <- c("2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")

  # Start from the fully-observed recent period.
  test_train <- data_prod[data_prod$event_date > upper[1]]

  for (k in seq_along(upper)) {
    if (k < length(upper)) {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date > lower[k]]
    } else {
      # Oldest window keeps its first observed day as well.
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date >= lower[k]]
    }
    model <- lm(visit_count ~ var, test_train)
    pred <- predict(model, data.frame(var = test$var))
    test$visit_count <- round(pred)
    # Estimated rows join the training pool for the next, older window.
    test_train <- rbind(test_train, test)
  }

  # Positional write-back (see NOTE); a visit count cannot be negative.
  data_prod$visit_count <- test_train$visit_count
  data_prod$visit_count <- ifelse(data_prod$visit_count <= 0,0,data_prod$visit_count)
  return(data_prod)
}

favored_count_calc <- function(data_prod){
  # Backcast favored_count for the period where the feed has no data
  # (before 2021-01-30) using chained linear regressions on a centered
  # basket_count predictor; same windowed scheme as visit_count_calc,
  # with the four duplicated stanzas folded into one loop.
  #
  # NOTE(review): the write-back at the end is positional — it assumes
  # test_train rows align with data_prod rows; verify if the input row
  # order ever changes.
  data_prod <- data_prod[,var:=basket_count-mean(basket_count)]

  # Window boundaries, newest to oldest; window k is (lower[k], upper[k]],
  # except the last one, which also includes its lower bound.
  upper <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01")
  lower <- c("2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  test_train <- data_prod[data_prod$event_date > upper[1]]

  for (k in seq_along(upper)) {
    if (k < length(upper)) {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date > lower[k]]
    } else {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date >= lower[k]]
    }
    model <- lm(favored_count ~ var, test_train)
    pred <- predict(model, data.frame(var = test$var))
    test$favored_count <- round(pred)
    # Estimated rows join the training pool for the next, older window.
    test_train <- rbind(test_train, test)
  }

  # Positional write-back; a favored count cannot be negative.
  data_prod$favored_count <- test_train$favored_count
  data_prod$favored_count <- ifelse(data_prod$favored_count <= 0,0,data_prod$favored_count)
  return(data_prod)
}

category_basket_calc <- function(data_prod){
  # Backcast category_basket for the period before 2021-01-30 using
  # chained linear regressions on a centered category_favored predictor;
  # same windowed scheme as visit_count_calc, with the four duplicated
  # stanzas folded into one loop.  Unlike the count-based variants, the
  # original applied no non-negativity clamp here, and that behavior is
  # preserved.
  #
  # NOTE(review): the write-back at the end is positional — it assumes
  # test_train rows align with data_prod rows; verify if the input row
  # order ever changes.
  data_prod <- data_prod[,var:=category_favored-mean(category_favored)]

  # Window boundaries, newest to oldest; window k is (lower[k], upper[k]],
  # except the last one, which also includes its lower bound.
  upper <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01")
  lower <- c("2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  test_train <- data_prod[data_prod$event_date > upper[1]]

  for (k in seq_along(upper)) {
    if (k < length(upper)) {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date > lower[k]]
    } else {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date >= lower[k]]
    }
    model <- lm(category_basket ~ var, test_train)
    pred <- predict(model, data.frame(var = test$var))
    test$category_basket <- round(pred)
    # Estimated rows join the training pool for the next, older window.
    test_train <- rbind(test_train, test)
  }

  # Positional write-back (no clamp, matching the original).
  data_prod$category_basket <- test_train$category_basket
  return(data_prod)
}

category_brand_sold_calc <- function(data_prod){
  # Backcast category_brand_sold for the period before 2021-01-30 using
  # chained linear regressions on a centered category_sold predictor;
  # same windowed scheme as visit_count_calc, with the four duplicated
  # stanzas folded into one loop.  No clamp in the original; preserved.
  #
  # NOTE(review): the write-back at the end is positional — it assumes
  # test_train rows align with data_prod rows; verify if the input row
  # order ever changes.
  data_prod <- data_prod[,var:=category_sold-mean(category_sold)]

  # Window boundaries, newest to oldest; window k is (lower[k], upper[k]],
  # except the last one, which also includes its lower bound.
  upper <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01")
  lower <- c("2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  test_train <- data_prod[data_prod$event_date > upper[1]]

  for (k in seq_along(upper)) {
    if (k < length(upper)) {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date > lower[k]]
    } else {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date >= lower[k]]
    }
    model <- lm(category_brand_sold ~ var, test_train)
    pred <- predict(model, data.frame(var = test$var))
    test$category_brand_sold <- round(pred)
    # Estimated rows join the training pool for the next, older window.
    test_train <- rbind(test_train, test)
  }

  # Positional write-back (no clamp, matching the original).
  data_prod$category_brand_sold <- test_train$category_brand_sold
  return(data_prod)
}

ty_visits_calc <- function(data_prod){
  # Backcast ty_visits (total Trendyol visits) for the period before
  # 2021-01-30 using chained linear regressions on a centered
  # category_sold predictor; same windowed scheme as visit_count_calc,
  # with the four duplicated stanzas folded into one loop.  No clamp in
  # the original; preserved.
  #
  # NOTE(review): the write-back at the end is positional — it assumes
  # test_train rows align with data_prod rows; verify if the input row
  # order ever changes.
  data_prod <- data_prod[,var:=category_sold-mean(category_sold)]

  # Window boundaries, newest to oldest; window k is (lower[k], upper[k]],
  # except the last one, which also includes its lower bound.
  upper <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01")
  lower <- c("2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  test_train <- data_prod[data_prod$event_date > upper[1]]

  for (k in seq_along(upper)) {
    if (k < length(upper)) {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date > lower[k]]
    } else {
      test <- data_prod[data_prod$event_date <= upper[k] &
                          data_prod$event_date >= lower[k]]
    }
    model <- lm(ty_visits ~ var, test_train)
    pred <- predict(model, data.frame(var = test$var))
    test$ty_visits <- round(pred)
    # Estimated rows join the training pool for the next, older window.
    test_train <- rbind(test_train, test)
  }

  # Positional write-back (no clamp, matching the original).
  data_prod$ty_visits <- test_train$ty_visits
  return(data_prod)
}

data_manip <- function(product_id, normal = T, discount = T, shift = T){
  # Build the per-product modeling table from the global `data` created
  # by read_data(): backcast the sparse features, fill prices, add a
  # daily price-change column, and optionally normalize.
  #
  # product_id  character id of the product to extract.
  # normal      if TRUE, center/scale feature columns 4:12 via caret.
  # discount    if TRUE, add a `discount` column (previous day's price
  #             minus today's price).
  # shift       NOTE(review): accepted but never used in this body —
  #             the column-shift correction happens in read_data().
  # Returns the product's data.table sorted by event_date.

  # Relies on the global `data` table being loaded beforehand.
  data_prod = data[data$product_content_id == product_id,]

  # Backcast the five features that have no data before 2021-01-30.
  data_prod <- visit_count_calc(data_prod)
  
  data_prod <- favored_count_calc(data_prod)
  
  data_prod <- category_basket_calc(data_prod)
  
  data_prod <- category_brand_sold_calc(data_prod)
  
  data_prod <- ty_visits_calc(data_prod)

  # Fill missing prices from neighboring days (backward, then forward).
  data_prod <- data_prod %>% fill(price, .direction = "up")
  
  data_prod <- data_prod %>% fill(price, .direction = "down")

  # Drop columns not needed for modeling (id is constant per product;
  # `var` is the regression helper added by the *_calc functions).
  data_prod$product_content_id <- NULL
  
  data_prod$var <- NULL
  
  data_prod <- arrange(data_prod, event_date)
  
  
  if(discount){
    # lag_1 is the previous day's price (data.table::shift).
    data_prod[,lag_1:=shift(price)]
    
    # Positive discount = price dropped since yesterday.
    data_prod[,discount := (lag_1-price)]
    
    # First row has no predecessor; treat as "no change".
    data_prod$discount <- na.fill(data_prod$discount, 0)
  }
  
  if(normal){
    # NOTE(review): columns selected by position (4:12) — fragile if the
    # column layout ever changes; confirm against read_data() output.
    preproc <- preProcess(data_prod[,c(4:12)], method=c("center", "scale"))
    
    new_cols <- predict(preproc, data_prod[,c(4:12)])
    
    data_prod[,c(4:12)] <- new_cols
  }

  # lag_1 was only a scratch column for the discount computation.
  data_prod$lag_1 <- NULL
  
  return(data_prod)
}

accu <- function(actual, forecast){
  # Summary accuracy metrics for a forecast against actuals.
  #
  # actual    numeric vector of observed values.
  # forecast  numeric vector of predictions, same length as `actual`.
  #
  # Returns a one-row data.frame with columns:
  #   n      number of observations
  #   mean   mean of the actuals
  #   sd     standard deviation of the actuals
  #   CV     coefficient of variation (sd / mean)
  #   FBias  forecast bias, sum(error) / sum(actual)
  #   MAPE   mean absolute percentage error (Inf/NaN if any actual is 0)
  #   RMSE   root mean squared error
  #   MAD    mean absolute deviation
  #   MADP   sum(|error|) / sum(|actual|)
  #   WMAPE  weighted MAPE, MAD / mean (equals MADP for non-negative actuals)
  n <- length(actual)
  error <- actual - forecast
  # Locals renamed so base mean()/sd() are not shadowed.
  mean_actual <- mean(actual)
  sd_actual <- sd(actual)
  mad_val <- sum(abs(error)) / n
  data.frame(
    n = n,
    mean = mean_actual,
    sd = sd_actual,
    CV = sd_actual / mean_actual,
    FBias = sum(error) / sum(actual),
    MAPE = sum(abs(error / actual)) / n,
    RMSE = sqrt(sum(error^2) / n),
    MAD = mad_val,
    MADP = sum(abs(error)) / sum(abs(actual)),
    WMAPE = mad_val / mean_actual
  )
}

forecast_with_arima <- function(data, forecast_ahead, target_name = 'sold_count',
                                is_seasonal = FALSE, is_stepwise = FALSE,
                                is_trace = TRUE, is_approx = FALSE, xreg1 = NULL){
  # Fit auto.arima on one column of `data` (selected by name) and
  # forecast `forecast_ahead` steps, optionally with external regressors.
  #
  # data            data.frame/data.table holding the target column.
  # forecast_ahead  number of steps to forecast.
  # target_name     name of the target column (default 'sold_count').
  # xreg1           optional external-regressor matrix/vector.
  # Returns list(forecast = predicted values, model = fitted ARIMA).
  #
  # The original built the column access via eval(parse(text = ...));
  # data[[target_name]] gives the same result with no string evaluation.
  input_series <- data[[target_name]]

  fitted <- auto.arima(input_series, seasonal = is_seasonal,
                       trace = is_trace, stepwise = is_stepwise,
                       approximation = is_approx, xreg = xreg1)

  # tail(NULL, n) is NULL, so this call is valid with or without xreg1.
  forecasted <- predict(fitted, n.ahead = forecast_ahead,
                        newxreg = tail(xreg1, forecast_ahead))$pred
  return(list(forecast = forecasted, model = fitted))
}

forecast_with_arima_extended <- function(data, forecast_ahead, target_name = 'sold_count',
                                         is_seasonal = FALSE, is_stepwise = FALSE,
                                         is_trace = TRUE, is_approx = FALSE,
                                         seasonality_period = NULL, fitted_model = NULL,
                                         xreg1 = NULL, decomposed = NULL){
  # Fit (or reuse) an ARIMA model on one column of `data`, optionally on
  # the random component of an additive decomposition, and forecast
  # `forecast_ahead` steps.  When decomposing, the last trend value and
  # the matching slice of the seasonal component are added back to the
  # forecast of the random component.
  #
  # Fixes vs the original:
  #   * column access via data[[target_name]] instead of eval(parse()).
  #   * the seasonal-offset index used length(ts) — the length of the
  #     base *function* `ts`, which is always 1 — instead of the length
  #     of the series being forecast.
  #   * with fitted_model supplied but decomposed NULL, the original
  #     referenced the undefined `random`; now the raw series is used.
  if(is_seasonal && !is.null(seasonality_period)){
    input_series <- ts(data[[target_name]], freq = seasonality_period)
  } else {
    input_series <- data[[target_name]]
  }

  input_series_decomposed <- NULL
  series_to_fit <- input_series
  if(!is.null(decomposed)){
    input_series_decomposed <- decompose(input_series, type = "additive")
    # Model only the remainder; trend + seasonal are re-added below.
    series_to_fit <- input_series_decomposed$random
  }

  if(is.null(fitted_model)){
    fitted <- auto.arima(series_to_fit, seasonal = is_seasonal,
                         trace = is_trace, stepwise = is_stepwise,
                         approximation = is_approx, xreg = xreg1)
  } else {
    # Refit an existing model specification to the new series.
    fitted <- Arima(series_to_fit, model = fitted_model)
  }

  if(is.null(xreg1)){
    forecasted <- predict(fitted, n.ahead = forecast_ahead)$pred
  } else {
    forecasted <- predict(fitted, n.ahead = forecast_ahead,
                          newxreg = tail(xreg1, forecast_ahead))$pred
  }

  if(!is.null(decomposed)){
    # Re-add the seasonal pattern aligned to where the series ends, plus
    # the last non-NA trend value as a level.
    # NOTE(review): the 7 here is hard-coded; presumably it should track
    # seasonality_period — confirm before reusing with other periods.
    offset <- length(input_series) %% 7
    forecasted <- forecasted +
      input_series_decomposed$seasonal[(offset + 1):(offset + forecast_ahead)] +
      tail(input_series_decomposed$trend[!is.na(input_series_decomposed$trend)], 1)
  }

  return(list(forecast = forecasted, model = fitted))
}

plotting <- function(data, product_name, col, num = 1){
  # Plot sales over time (num == 1) or actual vs. predicted sales
  # (any other num) for one product.
  #
  # data          table with event_date, sold_count and, for num != 1,
  #               a prediction column.
  # product_name  label used in the plot title.
  # col           line color for the single-series plot.
  # num           1 = sales only; otherwise comparison plot.
  if (num == 1) {
    p <- ggplot(data) +
      geom_line(aes(x = event_date, y = sold_count), color = col) +
      labs(x = "Date", y = "The Number of Sold",
           title = paste0("Sales(", product_name, ")")) +
      theme_minimal()
  } else {
    p <- ggplot(data) +
      geom_line(aes(x = event_date, y = sold_count, color = "Actual")) +
      geom_line(aes(x = event_date, y = prediction, color = "Prediction")) +
      labs(x = "Date", y = "Sales",
           title = paste0("Comparison of Sales and Predictions for ", product_name)) +
      theme_minimal()
  }
  p
}

cor_plot <- function(data, product_name){
  # Lower-triangle correlogram of the feature columns (positions 2:13),
  # hierarchically clustered, with correlation labels.
  #
  # data          per-product table as produced by data_manip().
  # product_name  label used in the plot title.
  #
  # The original began with a bare head(data) whose value was discarded
  # (inside a function it prints nothing) — dead code, removed.
  corr <- round(cor(data[,c(2:13)]), 2)
  
  ggcorrplot(corr, hc.order = TRUE, 
             type = "lower", 
             lab = TRUE, 
             lab_size = 3, 
             method="circle", 
             colors = c("tomato2", "white", "springgreen3"), 
             title=paste0("Correlogram of ", product_name), 
             ggtheme=theme_bw)
}

forecast_func <- function(data, reg_matrix){
  # Rolling-origin evaluation: for each test day after 2021-05-31, fit a
  # SARIMA (period 7) with external regressors on data up to two days
  # before, forecast two steps ahead, and keep the second step as that
  # day's prediction.
  #
  # data        per-product table from data_manip(), sorted by date.
  # reg_matrix  matrix of external regressors aligned row-for-row with
  #             `data` (cbind of one or more columns).
  # Returns a data.table with event_date, sold_count, prediction.
  test <- data[event_date>"2021-05-31"]
  test_dates <- test$event_date
  results <- vector('list', length(test_dates))
  for(i in seq_along(test_dates)){
    # Two-day information lag: only data up to D-2 is usable for day D.
    current_date <- test_dates[i]-2
    past_data <- data[event_date<=current_date]
    # BUG FIX: keep every regressor column. The original indexed column
    # 1 only, silently dropping any extra regressors (e.g. price in the
    # two-column Bluetooth Kulaklık call).
    reg_matrix_past <- reg_matrix[seq_len(nrow(past_data)), , drop = FALSE]
    forecast_data <- data.table(data[event_date==test_dates[i]])
    model <- forecast_with_arima_extended(data = past_data, forecast_ahead = 2, is_seasonal = TRUE,
                                          seasonality_period = 7, is_trace = FALSE,
                                          xreg1 = reg_matrix_past)
    # Sales cannot be negative; floor the forecast at zero.
    forecasted <- ifelse(model$forecast<0,0,model$forecast)
    forecast_data[,prediction:=forecasted[2]]
    results[[i]] <- forecast_data
    print(i)  # progress indicator — each iteration refits a model
  }
  results <- rbindlist(results)
  return(results[,c("event_date","sold_count", "prediction")])
}

data <- read_data("/Users/ozgurcan/Archive/PDFLibrary/IE360/IE360ClassProject", shift = F)

When the data are analyzed by hand, it is realized that some features (visit count, favored count, category basket, category brand sold, trendyol visits) do not have any data before 30.01.2021. These are visualized below.

ggplot(data[product_content_id == "85004",],aes(x=event_date))+
  geom_line(aes(y=category_basket), color="black")+
  labs(x="Date",y="The number of adding the same category products into the basket", title="Category Basket")+
  theme_minimal()

It can be seen that a large portion of the data is zero for this feature. There are 4 more features having the same issue. Discarding this part of the data would not be a good option because the model would not be able to catch the seasonalities. Therefore, past data are predicted by using linear regression with a variable in the data. For instance, category brand sold is estimated by using category sold. The features are selected in a logical way. However, this way increases the (structural) multicollinearity, which may affect the model performance. Thus, the independent variable in the linear regression is (x-x_bar). You can reach this approach from this link. The implementation can be found in the functions above.

The new feature is plotted below.

data_85004 <- data_manip("85004", normal = F, shift = F)
ggplot(data_85004,aes(x=event_date))+
  geom_line(aes(y=category_basket), color="black")+
  labs(x="Date",y="The number of adding the same category products into the basket", title="Category Basket")+
  theme_minimal()

It can be said that the data before 30.01.2021 seem to be distributed randomly.

It is obvious that after 28.05.2021, the number of adding the product in the same category into the basket has decreased very significantly, which seems not normal. After visual inspection, it is realized that every column is shifted by one. The corrected version is below.

data <- read_data("/Users/ozgurcan/Archive/PDFLibrary/IE360/IE360ClassProject", shift = T)
data_85004 <- data_manip("85004", normal = F, shift = T)
ggplot(data_85004,aes(x=event_date))+
  geom_line(aes(y=category_basket), color="black")+
  labs(x="Date",y="The number of adding the same category products into the basket", title="Category Basket")+
  theme_minimal()

Although there is a downward trend after this date, the scale shows that the assumption made above seems correct.

In addition, daily price changes are added into the dataset. This way may help to catch sudden increases in sales due to discounts. Special days may be helpful as well, but since not all products’ prices decrease at the same time, the price difference should be more robust.

Some features have large values, such as the total number of visits of Trendyol. These have to be normalized in order to improve the model performance. Scale differences may affect the effectiveness of the features. Therefore, necessary features are normalized. They are embedded into the model, which means the data in this report are displayed without normalization.

After some trials, it is observed that adding seasonality into ARIMA model (SARIMA) improves the model significantly. Therefore, SARIMA model will be used. In addition, since the number of data points is not large, auto.arima function will be used.

Firstly, the data need to be analyzed for each product. Since the forecasting process takes too much time, the results will be imported when they are analyzed. The forecast code can be found above.

Results

1) Dik Süpürge

For the product with id 7061886, which is “Dik Süpürge”, we’ve applied multiple feature engineering methods to overcome the insufficient or misleading data in several features. After that, we’ve established a stable version of our dataset. Even if it is patched with its own data, we’ve been compliant with the structure of the data itself and, as explained above, we’ve tried to outmaneuver the effects of multicollinearity.

data_7061886 <- data_manip("7061886", normal = F) # Dik Süpürge, for demonstration purposes only.
head(data_7061886)
##    event_date    price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 238.8718         39        2471          136           236
## 2: 2020-05-26 237.8758         33        2139          117           205
## 3: 2020-05-27 240.6333         36        2541          140           243
## 4: 2020-05-28 236.8132         38        2751          152           262
## 5: 2020-05-29 232.9308         39        2227          122           213
## 6: 2020-05-30 229.2183         71        3993          223           379
##    category_sold category_visits category_basket category_favored
## 1:           102             520           56048             2171
## 2:           110             508           55097             2118
## 3:           129             510           51490             1917
## 4:           140             529           54756             2099
## 5:           118             489           52369             1966
## 6:           162             608           57322             2242
##    category_brand_sold ty_visits   discount
## 1:                6727 105692032  0.0000000
## 2:                6956 107205276  0.9960373
## 3:                7498 110799229 -2.7575757
## 4:                7813 112879939  3.8201754
## 5:                7184 108718519  3.8823887
## 6:                8441 117041358  3.7124593
data_7061886 <- data_manip("7061886", normal = T) # Dik Süpürge

Then, we’ve plotted the sold count of the “Dik Süpürge” over time and it shows that aside from several special days, the number of sold hand vacuums has shown a nearly constant variance and linear trend. Therefore, we can say that the data set does not require more improvements, and it is suitable for model development.

plotting(data_7061886, "Dik Süpürge", "blue")

Hence, we are ready to develop our model. We’d like to utilize the power and reliability of SARIMA models with external regressors, thus let’s examine the correlation graph of our dataset.

cor_plot(data_7061886, "Dik Süpürge")

It shows that sold_count, which is our forecast target feature, is highly correlated with visit count and basket count. It is possible to have these two as our external regressors, but the correlation between visit count and basket count is 1, which is an indicator of total correlation. Therefore, it is unnecessary to have these two regressors as our external regressors, hence we’ve chosen only the “visit_count” as our regressor to aid the SARIMA model to forecast the desired interval. Also, from the logical approach, we can say that if a person visits a “Dik Süpürge”, they have mostly planned themselves to buy the “Dik Süpürge”. Since it is not a daily need like bread or water, the people who actually need it demand the product, hence they visit Trendyol and possibly this product.

forecast_7061886 = forecast_func(data_7061886, reg_matrix = cbind(data_7061886$visit_count))

After the forecasting, it is time to compare our predictions with the actual values. The plot below shows both predicted and actual values. Predicted values are plotted as in black and the actual values are represented as in blue. We can say that the predictions are not that far off from its actual values, but it can be improved.

load("forecast_7061886.RData")
plotting(forecast_7061886, "Dik Süpürge", "blue", num = 2)

Finally, let’s consider the metrics of our model. We’ve stated many metrics below and we mostly take “WMAPE” into consideration for simplicity. And our WMAPE score is 0.39, which indicates that we’ve created a sufficient model that understands the general direction of the time series, but it needs many enhancements to reach its final state.

accu(forecast_7061886$sold_count, forecast_7061886$prediction)
##    n     mean       sd        CV     FBias      MAPE     RMSE      MAD
## 1 27 15.96296 7.856802 0.4921895 0.2043776 0.4546801 7.735164 6.268343
##        MADP     WMAPE
## 1 0.3926804 0.3926804

2) Bluetooth Kulaklık

The data is imported.

data_6676673 <- data_manip("6676673", normal = F) # Bluetooth Kulaklık
head(data_6676673)
##    event_date    price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 137.6873        539       21850         1783          1443
## 2: 2020-05-26 134.4552        724       25401         2106          1721
## 3: 2020-05-27 136.8115        763       27380         2286          1876
## 4: 2020-05-28 138.8387        597       22004         1797          1455
## 5: 2020-05-29 136.1015        695       22675         1858          1508
## 6: 2020-05-30 134.7339        587       21608         1761          1424
##    category_sold category_visits category_basket category_favored
## 1:           674           10271          319887            25365
## 2:           882            4546          223382            16909
## 3:           944            4346          198549            14733
## 4:           758            4371          207371            15506
## 5:           860            3833          180049            13112
## 6:           756            3456          176009            12758
##    category_brand_sold ty_visits  discount
## 1:               20525 109186825  0.000000
## 2:               22745 113831777  3.232121
## 3:               23407 115216330 -2.356326
## 4:               21422 111062671 -2.027127
## 5:               22510 113340484  2.737164
## 6:               21400 111018008  1.367595

Now, the number of sales will be displayed.

data_6676673 <- data_manip("6676673") # Bluetooth Kulaklık
plotting(data_6676673, "Bluetooth Kulaklık", "navyblue")

The number of sales seems to be distributed randomly. Like the other products, there are some outliers, probably in special days. There is no obvious trend, or seasonality.

The correlation matrix will be demonstrated below.

cor_plot(data_6676673, "Bluetooth Kulaklık")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

It can be seen that the number of adding the product into the basket has the highest correlation with the number of sales, which is expected because if a “Bluetooth Kulaklık” is added into the basket, it should be bought generally because it is not bought frequently.

In addition, the price has a strong negative correlation with the sales. People may wait for “Bluetooth Kulaklık” getting discount during special days, such as Black Friday etc. As a result, price will be added as a feature in addition to basket_count feature.

The model is below.

forecast_6676673 = forecast_func(data_6676673, cbind(data_6676673$basket_count, data_6676673$price))

The comparison is below.

load("forecast_6676673")

plotting(forecast_6676673, "Bluetooth Kulaklık", "navyblue", num = 2)

Sudden changes in sales cannot be easily predicted by the model. Close predictions are observed from the graph. General performance of the model seems good.

The error rates are below.

accu(forecast_6676673$sold_count, forecast_6676673$prediction)
##    n     mean       sd        CV      FBias      MAPE     RMSE      MAD
## 1 27 480.1481 122.0762 0.2542469 0.03710542 0.2459352 127.4648 101.8843
##        MADP     WMAPE
## 1 0.2121936 0.2121936

WMAPE is low, which means the model has a great performance. Although the standard deviation is quite high, the model performs well.

3) Bebek İslak Mendil

For the product with id 4066298, which is “Bebek Islak Mendil”, we’ve inspected the dataset and found out that data itself has no significant errors or many missing data values. However, it is essential to recover the state of the dataset to its greatest form, thus we’ve applied several feature engineering methods to ensure the integrity of the dataset. Below, you can see several examples of the data set.

data_4066298 <- data_manip("4066298", normal = F) # Bebek Islak Mendil, for demonstration purposes only
head(data_4066298)
##    event_date    price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 76.05423        142        2070          307            92
## 2: 2020-05-26 75.78349        169        2154          330            99
## 3: 2020-05-27 76.92846        201        2406          399           121
## 4: 2020-05-28 77.21761        176        2191          340           103
## 5: 2020-05-29 76.98344        154        2030          296            89
## 6: 2020-05-30 77.15858        162        2016          292            88
##    category_sold category_visits category_basket category_favored
## 1:           413            1149           17326             2640
## 2:           486            1369           18807             3036
## 3:           562            1562           20289             3432
## 4:           550            1482           19597             3247
## 5:           473            1273           18126             2854
## 6:           494            1220           16783             2495
##    category_brand_sold ty_visits   discount
## 1:                2461 105438700  0.0000000
## 2:                2594 105956413  0.2707342
## 3:                2731 106495403 -1.1449666
## 4:                2710 106410299 -0.2891559
## 5:                2570 105864218  0.2341721
## 6:                2608 106013149 -0.1751387
data_4066298 <- data_manip("4066298", normal = T) # Bebek Islak Mendil

After sample illustration, let’s cover the status and direction of the sold count of the data set over time. In the first part of the data, which lasts until mid-November 2020, there is a clearly visible increasing exponential structure; between mid-November 2020 and January 2021, there is a decreasing exponential structure; and starting from January 2021, there is again an increasing exponential structure until today. The peak values are mostly the discount days of Trendyol itself, thus ignoring the peak values would lead us to a constant variance and non-zero mean. These findings are enough to take action, but it would be wise to reflect upon the correlation plot of the data set.

plotting(data_4066298, "Bebek Islak Mendil", "blue")

The correlation plot of the data set implies that sold count has a higher correlation with category sold than with other features. In logic, we can explain it as follows: Most customers tend to use “Bebek Islak Mendil” quite a lot and it is a critical need for families that have babies or kids. Also, these items have quite a price, therefore their primal instinct guides them to search for it without any brand specification. Therefore, its categorical sold value and the sold value of our product are highly correlated. Now, we can use this information as an external regressor in our SARIMA model enhanced with external regressors.

cor_plot(data_4066298, "Bebek Islak Mendil")

Hence, we construct our model with the newfound external regressor.

forecast_4066298 = forecast_func(data_4066298, cbind(data_4066298$category_sold))

As the plot illustrates, our actual values are colored with blue and the predicted values are colored with black. The predicted values are off of its actual ones, but the trajectory and the trend of our predictions are quite similar with the trend and trajectory of the actual data, thus we can say that we’ve performed more than sufficient.

load("forecast_4066298.RData")
plotting(forecast_4066298, "Bebek Islak Mendil", "blue", num = 2)

Finally, let’s check out the metrics of our model and we can say that even if we have a moderate WMAPE result, our plot shows that we’ve guessed the trajectory and trend of the data with a 1 or 2 day interval, thus we are happy with our result.

accu(forecast_4066298$sold_count, forecast_4066298$prediction)
##    n     mean       sd        CV       FBias      MAPE     RMSE      MAD
## 1 27 554.8519 447.0051 0.8056296 -0.05047382 0.4604197 538.7471 317.0189
##        MADP     WMAPE
## 1 0.5713578 0.5713578

4) Bikini Üstü (1)

After compiling necessary manipulations of data such as filling misleading values, shifting the columns for the data added later etc., moving to the deeper analysis for predictions would be wise. For further analysis, let us have a look at the data for the “bikini top” which has a product id “73318567”.

# Load "Bikini Üstü (1)" (product id 73318567) with normalized features
# (normal = TRUE); the printed head() below shows count/category columns on a
# normalized scale while price and sold_count stay raw — presumably data_manip
# normalizes only the regressor columns (TODO confirm in data_manip).
# NOTE(review): replaced `T` with `TRUE` — `T` is a reassignable variable,
# not a reserved word, so it is unsafe in scripts.
data_73318567 <- data_manip("73318567", normal = TRUE)
head(data_73318567)
##    event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 59.99          0  -0.3777477   -0.4015725    -0.3378189
## 2: 2020-05-26 59.99          0  -0.3777477   -0.4015725    -0.3378189
## 3: 2020-05-27 59.99          0  -0.3777477   -0.4015725    -0.3378189
## 4: 2020-05-28 59.99          0  -0.3777477   -0.4015725    -0.3378189
## 5: 2020-05-29 59.99          0  -0.3777477   -0.4015725    -0.3378189
## 6: 2020-05-30 59.99          0  -0.3777477   -0.4015725    -0.3378189
##    category_sold category_visits category_basket category_favored
## 1:     0.2333762     -0.07330705      -0.1135866       -0.1146999
## 2:     0.1188271     -0.12624283      -0.2017907       -0.2037681
## 3:    -0.1836792     -0.39391809      -0.5062805       -0.5112390
## 4:    -0.2853214     -0.44985023      -0.4969100       -0.5017755
## 5:    -0.0868772     -0.28904532      -0.3278500       -0.3310614
## 6:     0.1640014     -0.09727797      -0.1744105       -0.1761199
##    category_brand_sold   ty_visits discount
## 1:          0.21447408  0.12917094        0
## 2:          0.10920464  0.06576939        0
## 3:         -0.16881841 -0.10166426        0
## 4:         -0.26221563 -0.15792198        0
## 5:         -0.07983192 -0.04808551        0
## 6:          0.15070226  0.09077285        0

To provide a forecast model, observing data over time is very useful.

# Time-series plot of the normalized data for "Bikini Üstü (1)".
plotting(data_73318567, "Bikini Üstü (1)", "red")

Here, each column of the data is normalized since working with normalized data is better to minimize their weighted correlation.
As you can observe from the plot, the sales over time are very different from each other. There are not any sales nearly until February 2021. The reason behind that could be the bikini top is not for the sale in that time frame. After February 2021 the sales are increased and decreased instantly, and again there are no sales. The reason behind that also could be the same.
Later, when the summer hits, the sales increased very quickly, which may have occurred because of a change in price or increased stock of our product. In order to build a good prediction model, the model needs some regressors to predict the next sales. The regressors can be identified from the correlation matrix.

# Correlation plot used to select the external regressor (basket_count).
cor_plot(data_73318567, "Bikini Üstü (1)")

From this table, it is clear that “sold_count” is mostly correlated with “basket_count” and the second mostly with “visit_count” same as the correlations for the “coat”. However, “basket_count” and “visit_count” are highly correlated which is a danger sign for us about not to use both of them. As a result, the model will only use “basket_count” column to predict further sales.

# Fit the SARIMAX model for "Bikini Üstü (1)" with basket_count as the sole
# external regressor (chosen from the correlation plot above).
# NOTE(review): replaced `=` with the idiomatic `<-` assignment operator.
forecast_73318567 <- forecast_func(data_73318567, cbind(data_73318567$basket_count))

To understand the property of our forecasting model, let R plot the actual and forecasted values.

# Load the saved forecast and plot actual vs. predicted values (num = 2).
load("forecast_73318567.RData")
plotting(forecast_73318567, "Bikini Üstü (1)", "red", num = 2)

The forecasts actually seem really good, and the trend is fitted again.
For a clear last step, let us check our “accu” function which controls accuracy of prediction model.

# Accuracy metrics for the "Bikini Üstü (1)" forecast.
accu(forecast_73318567$sold_count, forecast_73318567$prediction)
##    n     mean       sd        CV      FBias      MAPE     RMSE      MAD
## 1 27 54.07407 38.72678 0.7161802 -0.3309904 0.6379742 38.33733 28.30561
##        MADP     WMAPE
## 1 0.5234599 0.5234599

The forecast model seems accurate enough if the MADP and WMAPE are considered.

5) Bikini Üstü (2)

The data is imported.

# Load "Bikini Üstü (2)" (product id 32737302) with raw, non-normalized values
# (normal = FALSE) — shown here only so the reader can see the original scale.
# NOTE(review): replaced `F` with `FALSE` — `F` is reassignable and unsafe.
data_32737302 <- data_manip("32737302", normal = FALSE) # Bikini Üstü (2)
head(data_32737302)
##    event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 59.99         40        3191          211           385
## 2: 2020-05-26 59.99         44        3191          211           385
## 3: 2020-05-27 59.99         33        2183          141           241
## 4: 2020-05-28 59.99         33        2313          150           259
## 5: 2020-05-29 59.99         33        2471          161           282
## 6: 2020-05-30 59.99         39        2586          169           298
##    category_sold category_visits category_basket category_favored
## 1:          1365            1806          274764             9738
## 2:          1223            1700          246892             8778
## 3:           848            1164          150675             5464
## 4:           722            1052          153636             5566
## 5:           968            1374          207058             7406
## 6:          1279            1758          255544             9076
##    category_brand_sold ty_visits discount
## 1:               38084 112754071        0
## 2:               35220 111701576        0
## 3:               27656 108922100        0
## 4:               25115 107988196        0
## 5:               30077 109811532        0
## 6:               36349 112116645        0

Now, the number of sales will be plotted.

# Reload with data_manip's default settings (presumably normalized features —
# confirm against data_manip's signature) and plot the sales series.
data_32737302 <- data_manip("32737302")# Bikini Üstü (2), default settings
plotting(data_32737302, "Bikini Üstü (2)", "green")

It can be said that there is almost no sale between August 2020 and March 2021, which is reasonable because people do not tend to buy bikini in Winter. Moreover, there is an upward trend in sales after April 2021 because people tend to prepare to the holiday period during Spring. Like the other products, daily number of sales always fluctuates.

Now, the correlation between features will be explored.

# Correlation plot used to select the external regressor (basket_count).
cor_plot(data_32737302, "Bikini Üstü (2)")

According to the correlogram above, the number of products added into the basket has the highest correlation with the number of sales. Since most of the features are correlated with each other, adding other features in addition to basket_count violates the regression assumption that the regressors must be independent. Therefore, basket_count will be added into the model as the sole feature.

# Fit the SARIMAX model for "Bikini Üstü (2)" with basket_count as the sole
# external regressor.
# NOTE(review): replaced `=` with the idiomatic `<-` assignment operator.
forecast_32737302 <- forecast_func(data_32737302, cbind(data_32737302$basket_count))

The comparison of actual values and predictions between 01.06.2021 and 27.06.2021 is displayed below.

# Load the saved forecast and plot actual vs. predicted values (num = 2).
load("forecast_32737302.RData")
plotting(forecast_32737302, "Bikini Üstü (2)", "green", num = 2)

Although the model cannot catch the unnatural sales quickly, it can be said that it gives close predictions to the actual sales. Error rates will be analyzed in order to interpret the results more properly.

# Accuracy metrics for the "Bikini Üstü (2)" forecast.
accu(forecast_32737302$sold_count, forecast_32737302$prediction)
##    n     mean       sd        CV      FBias      MAPE     RMSE      MAD
## 1 27 61.59259 20.70537 0.3361665 0.05147037 0.2784189 24.06486 16.89914
##        MADP     WMAPE
## 1 0.2743696 0.2743696

Weighted mean absolute percentage error (WMAPE) is 0.27, which is not high. Our model seems to perform well.

6) Yüz Temizleyici

The data is imported.

# Load "Yüz Temizleyici" (product id 85004) with raw, non-normalized values
# (normal = FALSE), shown only for display.
# NOTE(review): the original comment said "Bikini Üstü" — a copy-paste error;
# this product is the face cleanser. Also replaced `F` with `FALSE`.
data_85004 <- data_manip("85004", normal = FALSE) # Yüz Temizleyici
head(data_85004)
##    event_date    price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 79.34083         36        2275          233           450
## 2: 2020-05-26 78.52238         42        2570          282           509
## 3: 2020-05-27 79.81943         35        2245          228           444
## 4: 2020-05-28 80.95862         29        1884          168           371
## 5: 2020-05-29 80.54769         39        1848          162           364
## 6: 2020-05-30 80.28000         38        1878          167           370
##    category_sold category_visits category_basket category_favored
## 1:           260            4039          149496            16032
## 2:           303            4258          151516            16397
## 3:           246            3733          142893            14839
## 4:           311            3941          146496            15490
## 5:           342            4058          146081            15415
## 6:           355            4333          147819            15729
##    category_brand_sold ty_visits   discount
## 1:               24614  98583816  0.0000000
## 2:               25993 101280102  0.8184524
## 3:               24165  97705955 -1.2970476
## 4:               26250 101781737 -1.1391921
## 5:               27244 103725571  0.4109284
## 6:               27661 104540727  0.2676923

Now, the number of sales will be displayed.

# Reload with data_manip's default settings (presumably normalized features —
# confirm against data_manip) and plot the sales series.
# NOTE(review): the original "#Bikini Üstü" comment was a copy-paste error;
# product 85004 is "Yüz Temizleyici".
data_85004 <- data_manip("85004")# Yüz Temizleyici, default settings
plotting(data_85004, "Yüz Temizleyici", "blue")

The number of sales for “Yüz Temizleyici” seems stationary, but there is a slight increase in sales in general after January 2021. There are some outliers due to probably special discounts. It is hard to predict the sales due to these randomness.

Now, the correlation between features will be explored.

# Correlation plot used to select the external regressor (category_sold).
cor_plot(data_85004, "Yüz Temizleyici")

The highest correlation with the number of sales belongs to the number of sales in the same category, which is reasonable because people generally tend to choose a “Yüz Temizleyici” among the similar products. Since price and discounts are not significantly correlated with the number of sales, only category_sold will be added into the model.

# Fit the SARIMAX model for "Yüz Temizleyici" with the category-level sold
# count as the sole external regressor.
# NOTE(review): replaced `=` with the idiomatic `<-` assignment operator.
forecast_85004 <- forecast_func(data_85004, cbind(data_85004$category_sold))

The comparison between actual sales and predictions is demonstrated below.

# Load the saved forecast and plot actual vs. predicted values (num = 2).
load("forecast_85004.RData")

plotting(forecast_85004, "Yüz Temizleyici", "blue", num = 2)

It can be said that the model seems to fit well. At some points, there are bizarre predictions, but in general, it seems good.

The error percentages are below.

# Accuracy metrics for the "Yüz Temizleyici" forecast.
accu(forecast_85004$sold_count, forecast_85004$prediction)
##    n    mean       sd        CV       FBias      MAPE     RMSE      MAD
## 1 27 75.2963 18.23946 0.2422359 -0.04058475 0.2457908 25.19291 18.69379
##        MADP     WMAPE
## 1 0.2482698 0.2482698

WMAPE is 0.248, which is very good compared to the predictions for the other products. The reason is that the number of sales shows a stationary behavior.

7) Şarj Edilebilir Diş Fırçası

Our another product is the one with the id “32939029”, which is “Şarj Edilebilir Diş Fırçası”, and it was one of the products that did not require intense feature engineering. Yet, we do not like leaving tasks to chance, thus we’ve also applied the same feature engineering methods to this data set. And we’ve shown the 5 instances of our data set to be understood by the readers.

# Load "Şarj Edilebilir Diş Fırçası" (product id 32939029): first raw values
# for display, then the normalized version used for modeling.
# NOTE(review): replaced `F`/`T` with `FALSE`/`TRUE` — the single-letter forms
# are reassignable variables, not reserved words.
data_32939029 <- data_manip("32939029", normal = FALSE) # Şarj Edilebilir Diş Fırçası, for demonstration purposes only
head(data_32939029)
##    event_date    price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 112.9000         74        3113          323           369
## 2: 2020-05-26 115.8495        101        3704          411           451
## 3: 2020-05-27 114.1078        103        3617          398           439
## 4: 2020-05-28 115.1035         84        3428          370           413
## 5: 2020-05-29 126.1038         52        2669          257           309
## 6: 2020-05-30 127.5000         38        2179          184           241
##    category_sold category_visits category_basket category_favored
## 1:          1193            1231           48380             4132
## 2:          1351            1419           52014             4647
## 3:          1071            1125           46574             3876
## 4:           927             978           42764             3336
## 5:           810             851           39533             2878
## 6:           842             890           41007             3087
##    category_brand_sold ty_visits    discount
## 1:                6411 119209111   0.0000000
## 2:                7222 122676139  -2.9495050
## 3:                5785 116532038   1.7417380
## 4:                5046 113372216  -0.9956854
## 5:                4445 110804859 -11.0003938
## 6:                4610 111507042  -1.3961538
data_32939029 <- data_manip("32939029", normal = TRUE) # Şarj Edilebilir Diş Fırçası

Now, let’s dive deep into our data set and plot the change of the sold count value over time. We can say that the data itself has no obvious exponential trend, thus claiming the fact that the data set can be decomposed with an additive approach can be correct. The trend of the data set shows small increases in the first half, and there’s a peak point at the mid May 2021, and the remaining part is based on a decreasing trend. Furthermore, it is time to elaborate the correlation plot of the data set.

# Time-series plot of the normalized data for "Şarj Edilebilir Diş Fırçası".
plotting(data_32939029, "Şarj Edilebilir Diş Fırçası", "blue")

Behold, the correlation plot of the data set “Şarj Edilebilir Diş Fırçası”. This plot shows that our sold count values are highly correlated with the basket count values. It is not faulty to state that the price of “Şarj Edilebilir Diş Fırçası” is not that low and if one aims to have one, they mostly add the product in their shopping basket in Trendyol. Hence, we would like to select basket count as our external regressor to create our model.

# Correlation plot used to select the external regressor (basket_count).
cor_plot(data_32939029, "Şarj Edilebilir Diş Fırçası")

Then, we’re here to instantiate our SARIMA model with the decided extended regressor, “basket count”.

# Fit the SARIMAX model for "Şarj Edilebilir Diş Fırçası" with basket_count
# as the sole external regressor.
# NOTE(review): replaced `=` with the idiomatic `<-` assignment operator.
forecast_32939029 <- forecast_func(data_32939029, cbind(data_32939029$basket_count))

After training the model and constructing the predictions, let’s plot our findings against their actual values. As usual, our predictions nearly match the exact structure of the actual values, but they are shifted by either one or two days. Aside from that, we have predicted the trajectory and trend of the data set and come up with appropriate predictions. We are happy with the output, but it would be wise to compare this output with the metrics of the constructed model.

# Load the saved forecast and plot actual vs. predicted values (num = 2).
load("forecast_32939029.RData")
plotting(forecast_32939029, "Şarj Edilebilir Diş Fırçası", "blue", num = 2)

Our model metrics are shown below. WMAPE is the metric that we consider as the most important, thus having 0.44 as the WMAPE result is not that great. However, it is better than the most, thus we are satisfied with our findings and our model.

# Accuracy metrics for the "Şarj Edilebilir Diş Fırçası" forecast.
accu(forecast_32939029$sold_count, forecast_32939029$prediction)
##    n    mean      sd        CV       FBias      MAPE     RMSE      MAD
## 1 27 128.037 74.5378 0.5821581 -0.01416743 0.4564989 76.82305 56.43579
##        MADP     WMAPE
## 1 0.4407771 0.4407771

8) Mont

After compiling necessary manipulations of data such as filling misleading values, shifting the columns for the data added later etc., moving to the deeper analysis for predictions would be wise. For further analysis, let us have a look at the data for the “coat” which has a product id “48740784”.

# Load "Mont" (coat, product id 48740784): first raw values for display, then
# the normalized version used for modeling.
# NOTE(review): replaced `F`/`T` with `FALSE`/`TRUE` — the single-letter forms
# are reassignable variables, not reserved words.
data_48740784 <- data_manip("48740784", normal = FALSE) # raw values, for head() display only
head(data_48740784)
##    event_date  price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 833.32          0          18            0             1
## 2: 2020-05-26 833.32          0          18            0             1
## 3: 2020-05-27 833.32          0          18            0             1
## 4: 2020-05-28 833.32          0          18            0             1
## 5: 2020-05-29 833.32          0          18            0             1
## 6: 2020-05-30 833.32          0          18            0             1
##    category_sold category_visits category_basket category_favored
## 1:            15            1002          248050             6866
## 2:            14            1091          257980             7201
## 3:            16            1036          232193             6331
## 4:            17             846          213727             5708
## 5:            21             932          207028             5482
## 6:            15             778          200893             5275
##    category_brand_sold ty_visits discount
## 1:               33713 116610408        0
## 2:               32590 116586887        0
## 3:               34837 116633930        0
## 4:               35960 116657451        0
## 5:               40453 116751537        0
## 6:               33713 116610408        0
data_48740784 <- data_manip("48740784", normal = TRUE)

To provide a forecast model, observing data over time is very useful.

# Time-series plot of the normalized data for "Mont".
plotting(data_48740784, "Mont", "orange")

Here, each column of the data is normalized since working with normalized data is better to minimize their weighted correlation. As you can observe, for some seasons such as Summer of 2020, January of 2021 or Spring of 2021 there are not any sales at all, which leads us there may be a kind of a seasonality or there may be a different effects such as price. To understand what affects our daily sales, it is necessary that to control over correlations between different properties. It is predictable that the data is correlated with other columns that are provided by “trendyol”. In order to become a good prediction model, the model needs some regressors to predict next sales. To select proper regressor, usage of correlation matrix would be wise.

# Correlation plot used to select the external regressor (basket_count).
cor_plot(data_48740784, "Mont")

From this table, it can be seen that “sold_count” is most correlated with “basket_count” and second most with “visit_count”. However, “basket_count” and “visit_count” are highly correlated with each other, which warns us not to use both of them. As a result, the model will only use the “basket_count” column to predict further sales.
To understand the correlation, it is obvious that to complete a sale, the customer has to add the product to the basket.

# Fit the SARIMAX model for "Mont" with basket_count as the sole external
# regressor, then load the precomputed forecast and plot actual vs. predicted.
# NOTE(review): replaced `=` with the idiomatic `<-` assignment operator.
forecast_48740784 <- forecast_func(data_48740784, cbind(data_48740784$basket_count))
load("forecast_48740784.RData")
plotting(forecast_48740784, "Mont", "orange", num = 2)

The forecast model seems predicted sales fair average. It can be said that trend is correct but there are some different oscillations from model predictions.
For a clear last step let us check our “accu” function which controls accuracy of prediction model.

# Accuracy metrics for the "Mont" forecast. MAPE is Inf because some actual
# sold_count values are zero (division by zero in the percentage error).
accu(forecast_48740784$sold_count, forecast_48740784$prediction)
##    n     mean       sd        CV       FBias MAPE     RMSE      MAD      MADP
## 1 27 2.111111 1.739437 0.8239438 -0.09132548  Inf 1.865766 1.383203 0.6552014
##       WMAPE
## 1 0.6552014

For a prediction model that predicts different numbers between 0 and 6, accuracy of our model seems fair enough, which leads us to calculations of the other products.

9) Tayt

After compiling necessary manipulations of data such as filling values, shifting the columns for the data added later etc., moving to the deeper analysis for predictions would be wise. For further analysis, let us have a look at the data for the “sports tights” which has a product id “31515569”.

# Load "Tayt" (sports tights, product id 31515569) with normalized features.
# NOTE(review): replaced `T` with `TRUE` — `T` is a reassignable variable,
# not a reserved word, so it is unsafe in scripts.
data_31515569 <- data_manip("31515569", normal = TRUE)
head(data_31515569)
##    event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 44.99        610  -0.1327961   -0.1350916    -0.1260358
## 2: 2020-05-26 44.99        437  -0.4681572   -0.4763333    -0.4435424
## 3: 2020-05-27 44.99        270  -0.5941980   -0.6045707    -0.5625266
## 4: 2020-05-28 44.99        366  -0.4503018   -0.4581301    -0.4267294
## 5: 2020-05-29 44.99       1188   0.6408465    0.6519924     0.6066221
## 6: 2020-05-30 44.99       1162   0.4434756    0.4512140     0.4197394
##    category_sold category_visits category_basket category_favored
## 1:   -0.36034958     -0.22481240      -0.2107577       -0.2137058
## 2:   -0.43794209     -0.07987272      -0.1781174       -0.1806089
## 3:   -0.53479516     -0.44153502      -0.6061420       -0.6146205
## 4:   -0.50232737     -0.38915752      -0.4801228       -0.4868387
## 5:   -0.09235273     -0.33763868      -0.4385379       -0.4446722
## 6:   -0.12041811     -0.09034822      -0.1600886       -0.1623272
##    category_brand_sold   ty_visits discount
## 1:         -0.27800293 -0.28163264        0
## 2:         -0.33787195 -0.34227536        0
## 3:         -0.41257733 -0.41797126        0
## 4:         -0.38750099 -0.39259595        0
## 5:         -0.07122497 -0.07217866        0
## 6:         -0.09286858 -0.09411326        0

To provide a forecast model, observing data over time is very useful.

# Time-series plot of the normalized data for "Tayt".
plotting(data_31515569, "Tayt", "purple")

Here, each column of the data is normalized since working with normalized data is better to minimize their weighted correlation.
From the plot it can be observed that there is much variation in the series. The sales follow a general trend over time. However, there are various instant increases in the sales, like the one on “Black Friday”. The reason behind that could be a decrease in price. The following calculations will help us determine whether this assumption is correct. In order to build a good prediction model, the model needs some regressors to predict the next sales. The regressors can be identified from the correlation matrix.

# Correlation plot used to select the external regressors
# (category_sold and price, per the discussion below).
cor_plot(data_31515569, "Tayt")

From this correlation matrix, it can be seen that “sold_count” is most correlated with “category_sold”, which is completely different from the previous products. Also, the second most correlated column with “sold_count” is “category_visits”. However, “category_sold” and “category_visits” are highly correlated with each other, which warns us not to use both of them.
For this product, there is another important correlation. The correlation between price and sold_count is quite high relative to previous ones. Hence, the “price” column is also added to the regressors with column named “category_sold”. As a result, the model will use “category_sold” and “price” columns to predict further sales.

# Fit the SARIMAX model for "Tayt" with two external regressors: the
# category-level sold count and the product price (cbind builds the xreg matrix).
# NOTE(review): replaced `=` with the idiomatic `<-` assignment operator.
forecast_31515569 <- forecast_func(data_31515569, cbind(data_31515569$category_sold, data_31515569$price))

To understand the property of our forecasting model, let R plot the actual and forecasted values.

# Load the saved forecast and plot actual vs. predicted values (num = 2).
load("forecast_31515569.RData")
plotting(forecast_31515569, "Tayt", "purple", num = 2)

The forecasts seem good enough, and the trend is fitted again, although there is a small delay between the predictions and the actual values.
For a clear last step let us check our “accu” function which controls accuracy of prediction model.

# Accuracy metrics for the "Tayt" forecast.
accu(forecast_31515569$sold_count, forecast_31515569$prediction)
##    n     mean       sd        CV      FBias     MAPE     RMSE      MAD
## 1 27 325.2593 190.3023 0.5850788 0.03214571 1.006609 331.1772 265.7523
##        MADP     WMAPE
## 1 0.8170477 0.8170477

For a product that has a sold_count between approximately 0 and 10500, it is hard to obtain a significantly accurate model since the oscillations are not seasonal. For this product, it might be wise to use a binary variable that indicates special days, such as “Black Friday”, with special discounts.

Conclusion

Future Work